import pandas as pd
import plotly.express as px
Check Plotly's version.
import plotly
print(f'plotly version={plotly.__version__}')
plotly version=5.18.0
df = pd.read_csv('https://github.com/subwaymatch/mba562-dataviz-demos/raw/main/dataviz-tutorials/superstore_data.csv')
df
| Row.ID | Order.Date | Sales | Category | Sub.Category | Region | |
|---|---|---|---|---|---|---|
| 0 | 1 | 11/8/16 | 261.9600 | Furniture | Bookcases | South |
| 1 | 2 | 11/8/16 | 731.9400 | Furniture | Chairs | South |
| 2 | 3 | 6/12/16 | 14.6200 | Office Supplies | Labels | West |
| 3 | 4 | 10/11/15 | 957.5775 | Furniture | Tables | South |
| 4 | 5 | 10/11/15 | 22.3680 | Office Supplies | Storage | South |
| ... | ... | ... | ... | ... | ... | ... |
| 9989 | 9990 | 1/21/14 | 25.2480 | Furniture | Furnishings | South |
| 9990 | 9991 | 2/26/17 | 91.9600 | Furniture | Furnishings | West |
| 9991 | 9992 | 2/26/17 | 258.5760 | Technology | Phones | West |
| 9992 | 9993 | 2/26/17 | 29.6000 | Office Supplies | Paper | West |
| 9993 | 9994 | 5/4/17 | 243.1600 | Office Supplies | Appliances | West |
9994 rows × 6 columns
Convert the "Order.Date" column into a Datetime type.
df['Order.Date'] = pd.to_datetime(
df['Order.Date'],
format='%m/%d/%y'
)
df.head()
| Row.ID | Order.Date | Sales | Category | Sub.Category | Region | |
|---|---|---|---|---|---|---|
| 0 | 1 | 2016-11-08 | 261.9600 | Furniture | Bookcases | South |
| 1 | 2 | 2016-11-08 | 731.9400 | Furniture | Chairs | South |
| 2 | 3 | 2016-06-12 | 14.6200 | Office Supplies | Labels | West |
| 3 | 4 | 2015-10-11 | 957.5775 | Furniture | Tables | South |
| 4 | 5 | 2015-10-11 | 22.3680 | Office Supplies | Storage | South |
Check the data types of each column.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row.ID 9994 non-null int64 1 Order.Date 9994 non-null datetime64[ns] 2 Sales 9994 non-null float64 3 Category 9994 non-null object 4 Sub.Category 9994 non-null object 5 Region 9994 non-null object dtypes: datetime64[ns](1), float64(1), int64(1), object(3) memory usage: 468.6+ KB
We’ll begin with a line chart that shows how sales have trended over time. This is useful to identify seasonality or trends.
First, create an aggregated DataFrame that shows the total sales by month.
df['YearMonth'] = df['Order.Date'].dt.strftime('%Y-%m')
df_sales_by_month = df.groupby(by='YearMonth', as_index=False) \
.agg({'Sales': 'sum'})
df_sales_by_month.head()
| YearMonth | Sales | |
|---|---|---|
| 0 | 2014-01 | 14236.895 |
| 1 | 2014-02 | 4519.892 |
| 2 | 2014-03 | 55691.009 |
| 3 | 2014-04 | 28295.345 |
| 4 | 2014-05 | 23648.287 |
fig = px.line(
df_sales_by_month,
x='YearMonth',
y='Sales',
template='simple_white'
)
fig.show()
The sales peak end of 2017. We can explore whether this is driven by a specific category or region. Imagine you are a category manager and want to plot the sales for various cateories instead.
First, let's plot line chart again and then see if there are better ways to show category sales:
df_sales_by_month_category = df.groupby(
by=['YearMonth', 'Category'],
as_index=False
).agg({'Sales': 'sum'})
df_sales_by_month_category.head()
| YearMonth | Category | Sales | |
|---|---|---|---|
| 0 | 2014-01 | Furniture | 6242.525 |
| 1 | 2014-01 | Office Supplies | 4851.080 |
| 2 | 2014-01 | Technology | 3143.290 |
| 3 | 2014-02 | Furniture | 1839.658 |
| 4 | 2014-02 | Office Supplies | 1071.724 |
fig = px.line(
df_sales_by_month_category,
x='YearMonth',
y='Sales',
color='Category',
template='simple_white'
)
fig.show()
df_sales_by_category = df.groupby(
by=['Category'],
as_index=False
).agg({'Sales': 'sum'})
# sort by sales in descending order
df_sales_by_category.sort_values(
by='Sales',
ascending=False,
inplace=True
)
df_sales_by_category.head()
| Category | Sales | |
|---|---|---|
| 2 | Technology | 836154.0330 |
| 0 | Furniture | 741999.7953 |
| 1 | Office Supplies | 719047.0320 |
fig = px.bar(
df_sales_by_category,
x='Category',
y='Sales',
color='Category',
template='simple_white'
)
fig.show()
fig = px.pie(
df_sales_by_category,
names='Category',
values='Sales',
color='Category',
template='simple_white'
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
Another type of chart that displays relativity is a treemap.
fig = px.treemap(
df_sales_by_category,
path=['Category'],
names='Category',
values='Sales',
template='simple_white',
)
fig.show()
Add custom labels.
labels = [f"{row['Category']}<br>{'${:,.0f}'.format(row['Sales'])}" for _, row in df_sales_by_category.iterrows()]
fig = px.treemap(
df_sales_by_category,
path=['Category'],
names='Category',
values='Sales',
template='simple_white',
)
fig.update_traces(labels=labels)
fig.show()